## Attach the packages used in these slides. The order is left untouched
## because packages attached later can mask functions from earlier ones.
library(tidyverse)
library(corrplot)
library(pheatmap)
library(scales)
library(patchwork)
library(ggcorrplot)
library(ggbiplot)
library(plotly)
library(palmerpenguins)
## Make theme_bw() the default theme for every ggplot2 figure drawn after this
theme_set(theme_bw())
Extended Visualisations In R
RAdelaide 2025
Additional Key Visualisations
Introduction
- Beyond all variations of regression and ggplot visualisations
- Principal Component Analysis
- Correlations
- Heatmaps
Correlations
- Assessing correlations between variables is extremely common
- Interpretation is intuitive for numeric variables
- Not clear for factors \(\implies\) maybe if ordered?
- Pearson correlations for Normally-distributed variables
- Spearman correlations \(\implies\) rank-based correlations
- The function `cor()` always returns a symmetric matrix
  - Usually a diagonal of 1 \(\implies\) each variable's correlation with itself
Basic Visualisations
- Try checking a few different methods and types
- `corrplot` can make some helpful & simple visualisations
  - `method = c("circle", "square", "ellipse", "number", "shade", "color", "pie")`
  - `type = c("full", "lower", "upper")`
## Draw the correlation matrix by calling corrplot() directly on penguin_cor
corrplot(
  penguin_cor,
  ## Leave the diagonal cells out of the figure
  diag = FALSE,
  ## Arrange the measurements in order of clustering
  order = "hclust",
  ## Write the correlation coefficients in dark grey ...
  addCoef.col = "grey20",
  ## ... and show them as integers (cor * 100)
  addCoefasPercent = TRUE
)
- Can’t be saved as an object \(\implies\) only returns the matrix
Limitations of corrplot
- Because we can’t save figures as an object, it becomes difficult to modify or combine them
- What if we wanted to split correlations by some category?
- The below calculates correlations within each sex
## The magrittr pipe (%>%) lets the piped data be accessed as `.`,
## which is what `split()` needs to find the sex column
penguins %>%
  split(f = .$sex) |> ## One data frame per sex
  lapply(\(df) {
    ## Keep the measurement columns, drop incomplete rows, then correlate
    df |>
      dplyr::select(ends_with("_mm"), ends_with("_g")) |>
      dplyr::filter(!if_any(everything(), is.na)) |>
      cor()
  })
$female
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
bill_length_mm 1.0000000 -0.4263804 0.5714737 0.5812947
bill_depth_mm -0.4263804 1.0000000 -0.7941673 -0.7477697
flipper_length_mm 0.5714737 -0.7941673 1.0000000 0.8848250
body_mass_g 0.5812947 -0.7477697 0.8848250 1.0000000
$male
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
bill_length_mm 1.0000000 -0.3952939 0.6626541 0.4767611
bill_depth_mm -0.3952939 1.0000000 -0.7162006 -0.7554917
flipper_length_mm 0.6626541 -0.7162006 1.0000000 0.8654987
body_mass_g 0.4767611 -0.7554917 0.8654987 1.0000000
A ggplot-based Alternative
## Draw the correlations with ggcorrplot and keep the figure as `p`
p <- ggcorrplot(
  penguin_cor,
  hc.order = TRUE, ## Order the variables by hierarchical clustering
  lab = TRUE, ## Write the coefficient inside each cell
  show.diag = FALSE ## Leave the diagonal out
)
## Print the stored figure
p
DIY Correlation Plots
- Realistically we can do it ourselves
- Gives complete control over everything
- To cluster variables \(\implies\) use `hclust()`
  - Return a vector of variables in order
  - How they would appear on a dendrogram
## Variable names in the order they would appear on a dendrogram.
## The `labels` element of an hclust object keeps the original input order,
## so it has to be indexed by the `order` element to get the clustered order.
cor_lab <- penguin_cor |>
  dist() |> ## hclust needs a distance matrix as input
  hclust() |> ## Cluster the variables
  (\(hc) hc$labels[hc$order])() ## Rearrange the labels into dendrogram order
Now Complete the Figure
## Finish the figure: diverging colour scale, tidy axis labels, no grid/titles
p +
  scale_fill_gradient2(
    low = "darkblue",
    high = "darkred",
    limits = c(-1, 1),
    ## Show the legend values as whole percentages
    labels = percent_format(accuracy = 1)
  ) +
  scale_x_discrete(
    expand = expansion(0, 0),
    ## e.g. "bill_length_mm" becomes "Bill Length"
    labels = function(lbl) {
      no_unit <- str_remove_all(lbl, "_[gm]+$")
      spaced <- str_replace_all(no_unit, "_", " ")
      str_to_title(spaced)
    }
  ) +
  scale_y_discrete(
    expand = expansion(0, 0),
    ## Same tidying as used for the x-axis labels
    labels = function(lbl) {
      no_unit <- str_remove_all(lbl, "_[gm]+$")
      spaced <- str_replace_all(no_unit, "_", " ")
      str_to_title(spaced)
    }
  ) +
  theme(axis.title = element_blank(), panel.grid = element_blank())
Saving As a Function
## Draw a correlation matrix as a ggplot2 heatmap with clustered axes.
##
## x:   a square correlation matrix with row and column names, such as the
##      output of `cor()`
## ...: currently unused
##
## Returns a ggplot object with one labelled cell for every pair of distinct
## variables (the diagonal is removed).
my_corrplot <- function(x, ...) {
  ## Get the labels in clustered order. The `labels` element of an hclust
  ## object keeps the input order, so index it by the `order` element
  hc <- hclust(dist(x))
  labs <- hc$labels[hc$order]
  ## Define a function to tidy labels, e.g. "bill_length_mm" -> "Bill Length"
  my_label_fun <- \(x) {
    str_remove_all(x, "_[gm]+$") |> str_replace_all("_", " ") |> str_to_title()
  }
  ## Coerce to a long tibble with one row per pair of variables.
  ## Every column except the row-name column is pivoted, so variables
  ## whose names contain no underscore are kept as well
  tbl <- x |>
    as_tibble(rownames = "x") |>
    pivot_longer(-all_of("x"), names_to = "y", values_to = "Correlation") |>
    dplyr::filter(x != y) |> ## Remove the diagonal
    mutate(across(all_of(c("x", "y")), \(x) fct(x, levels = labs)))
  ## Create the basic plot
  ggplot(tbl, aes(x, fct_rev(y))) + ## Note that fct_rev will change the diagonal
    geom_raster(aes(fill = Correlation)) +
    geom_label(
      aes(label = Correlation),
      ## Only the label layer gets the values formatted as percentages;
      ## the fill layer still needs the numeric correlations
      data = \(df) mutate(df, Correlation = percent(Correlation, accuracy = 1)),
      size = 4, fill = "#FFFFFF99"
    ) +
    scale_fill_gradient2(
      low = "darkblue", high = "darkred", limits = c(-1, 1),
      labels = percent_format(accuracy = 1)
    ) +
    scale_x_discrete(expand = expansion(0, 0), labels = my_label_fun) +
    scale_y_discrete(expand = expansion(0, 0), labels = my_label_fun)
}
Apply To Each Cohort
- First we’ll split the data again
- Then we’ll create each figure saving as a list of figures
## One correlation figure per sex, stored as a named list
cor_by_sex <- penguins |>
  split(f = penguins$sex) |>
  lapply(\(df) {
    ## Measurements only, complete rows only, then correlate and plot
    df |>
      dplyr::select(ends_with("_mm"), ends_with("_g")) |>
      dplyr::filter(!if_any(everything(), is.na)) |>
      cor() |>
      my_corrplot()
  })
Using Patchwork
- The package `patchwork` allows for multi-plot layouts
## Title each figure with its list name, then hand the list of figures to
## `patchwork::wrap_plots()` so they sit in a single figure
wrap_plots(
  lapply(names(cor_by_sex), function(nm) cor_by_sex[[nm]] + ggtitle(nm))
) +
  ## One row of panels, with similar axes/legends collected
  plot_layout(nrow = 1, guides = "collect", axes = "collect") +
  ## Tag the panels with capital letters
  plot_annotation(tag_levels = "A") &
  ## Using `&` adds the theme to all figures
  theme(
    axis.title = element_blank(),
    panel.grid = element_blank(),
    legend.position = "bottom"
  )
Using pheatmap
- A final useful package is `pheatmap`
  - Widely used beyond correlations
- Not `ggplot2` based
- Heavily customisable
- Allows for dendrograms on either or both axes
- Can take a bit of fiddling
## Heatmap of the correlations with the value printed in each cell
penguin_cor |>
  pheatmap(display_numbers = TRUE)
- Notice the colour gradient is not centred at zero!
  - Extremely common issue with `pheatmap()`
Brief Comments
- The above provides multiple strategies for correlations and heatmaps
- The package `ComplexHeatmap` is widely used when including annotations
  - Would take half a day to demonstrate
- Also sets the stage for PCA
Principal Component Analysis
Principal Component Analysis
- PCA is very heavily used across many fields
- Is an unsupervised technique
- Naive to any pre-defined groups or data structures
- PLS is an analogous supervised technique
- Identifies direction(s) of maximal variance in multi-dimensional data
- Successive components are orthogonal
- In 2D super easy to imagine
- Data is rotated around components
Penguins Raw
- Let’s work on the complete (raw) penguins dataset
- 2 additional measurements (\(\Delta N_{15}\) & \(\Delta C_{13}\))
- Slightly different column names
- Can generate identifiers for each set of measurements
## Preview each column of the raw data with its type and first few values
penguins_raw |>
  glimpse()
Rows: 344
Columns: 17
$ studyName <chr> "PAL0708", "PAL0708", "PAL0708", "PAL0708", "PAL…
$ `Sample Number` <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ Species <chr> "Adelie Penguin (Pygoscelis adeliae)", "Adelie P…
$ Region <chr> "Anvers", "Anvers", "Anvers", "Anvers", "Anvers"…
$ Island <chr> "Torgersen", "Torgersen", "Torgersen", "Torgerse…
$ Stage <chr> "Adult, 1 Egg Stage", "Adult, 1 Egg Stage", "Adu…
$ `Individual ID` <chr> "N1A1", "N1A2", "N2A1", "N2A2", "N3A1", "N3A2", …
$ `Clutch Completion` <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", …
$ `Date Egg` <date> 2007-11-11, 2007-11-11, 2007-11-16, 2007-11-16,…
$ `Culmen Length (mm)` <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34…
$ `Culmen Depth (mm)` <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18…
$ `Flipper Length (mm)` <dbl> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190,…
$ `Body Mass (g)` <dbl> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 34…
$ Sex <chr> "MALE", "FEMALE", "FEMALE", NA, "FEMALE", "MALE"…
$ `Delta 15 N (o/oo)` <dbl> NA, 8.94956, 8.36821, NA, 8.76651, 8.66496, 9.18…
$ `Delta 13 C (o/oo)` <dbl> NA, -24.69454, -25.33302, NA, -25.32426, -25.298…
$ Comments <chr> "Not enough blood for isotopes.", NA, NA, "Adult…
Principal Component Analysis
- PCA expects variables to be on comparable scales
- `scale. = TRUE` will scale all column variances to 1
  - This is not `TRUE` by default but should be
  - Will significantly impact results: different scales \(\implies\) bias in variance measures
- `center = TRUE` will centre the data to mean = 0
  - This is `TRUE` by default
## `prcomp()` is the most common function for PCA.
## Centre every column to mean 0 and scale it to variance 1 before rotating
penguin_pca <- penguin_mat |>
  prcomp(center = TRUE, scale. = TRUE)
Using broom::tidy()
- My preferred approach is `broom::tidy()` \(\implies\) take control of all plots & questions
## Convert the PCA result to a long tibble: one row per sample and component
penguin_pca |>
  broom::tidy()
# A tibble: 1,980 × 3
row PC value
<chr> <dbl> <dbl>
1 PAL0708_N1A2 1 -1.65
2 PAL0708_N1A2 2 -0.0297
3 PAL0708_N1A2 3 0.757
4 PAL0708_N1A2 4 -0.444
5 PAL0708_N1A2 5 0.593
6 PAL0708_N1A2 6 0.294
7 PAL0708_N2A1 1 -1.06
8 PAL0708_N2A1 2 -0.706
9 PAL0708_N2A1 3 0.424
10 PAL0708_N2A1 4 -0.805
# ℹ 1,970 more rows
Visualisation Using ggplot2()
- Now we can tailor our own visualisation
## Scatter of PC1 against PC2: colour shows species, point shape shows sex
A <- ggplot(penguin_pca_tbl, aes(x = PC1, y = PC2)) +
  geom_point(aes(shape = Sex, colour = Species)) +
  scale_colour_brewer(
    ## Shorten each legend entry to its leading run of letters
    labels = function(lbl) str_extract(lbl, "^[:alpha:]+"),
    palette = "Set1"
  ) +
  ## Shapes 1 and 19 are the open and solid circles
  scale_shape_manual(labels = str_to_title, values = c(1, 19))
## Print the stored figure
A
What Measurements Drive Each Component?
- Beyond a `biplot` \(\implies\) correlations between PCs and measurements
- Make a figure with PCs on the x-axis & variables on the y-axis
- Fill a heatmap using correlations
- Best to only use numeric variables
- Will check our categorical variables later
Interactive Plots
- The package `plotly` can transform any `ggplot` object into an interactive figure
  - `plotly::ggplotly()`
## Build the static ggplot first, then pass it to ggplotly() to make it
## interactive
(
  ggplot(
    penguin_pca_tbl,
    aes(
      x = PC1, y = `Flipper Length (mm)`,
      ## Extra, made-up aesthetics added just to see if
      ## ggplot can handle them
      key1 = studyName, key2 = `Individual ID`
    )
  ) +
    geom_point(aes(shape = Sex, colour = Species)) +
    scale_colour_brewer(
      labels = function(lbl) str_extract(lbl, "^[:alpha:]+"),
      palette = "Set1"
    ) +
    scale_shape_manual(labels = str_to_title, values = c(21, 19))
) |>
  ggplotly()